library(gtools)
library(ellipse)
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
## 
##     pairs
library(rstan)
## Loading required package: StanHeaders
## Loading required package: ggplot2
## rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
library(mclust)
## Package 'mclust' version 5.4.6
## Type 'citation("mclust")' for citing this R package in publications.
library(tictoc)
require(plyr)
## Loading required package: plyr
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rstan)
library(mvtnorm)
## 
## Attaching package: 'mvtnorm'
## The following object is masked from 'package:mclust':
## 
##     dmvnorm
#library(ks)
#library(weights)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
## 
## Attaching package: 'mgcv'
## The following object is masked from 'package:mclust':
## 
##     mvn
## The following object is masked from 'package:gtools':
## 
##     scat
library(RColorBrewer)

RUN_STAN = F # if TRUE this runs the model on individual patient probabilities - takes about 10 hours
options(mc.cores = parallel::detectCores()-1)

version
##                _                           
## platform       x86_64-apple-darwin17.0     
## arch           x86_64                      
## os             darwin17.0                  
## system         x86_64, darwin17.0          
## status                                     
## major          4                           
## minor          0.2                         
## year           2020                        
## month          06                          
## day            22                          
## svn rev        78730                       
## language       R                           
## version.string R version 4.0.2 (2020-06-22)
## nickname       Taking Off Again
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-2   mgcv_1.8-31          nlme_3.1-148        
##  [4] mvtnorm_1.1-1        dplyr_1.0.0          plyr_1.8.6          
##  [7] tictoc_1.0           mclust_5.4.6         rstan_2.21.2        
## [10] ggplot2_3.3.2        StanHeaders_2.21.0-5 ellipse_0.4.2       
## [13] gtools_3.8.2        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.0   xfun_0.15          purrr_0.3.4        splines_4.0.2     
##  [5] lattice_0.20-41    V8_3.2.0           colorspace_1.4-1   vctrs_0.3.2       
##  [9] generics_0.0.2     htmltools_0.5.0    stats4_4.0.2       loo_2.3.1         
## [13] yaml_2.2.1         rlang_0.4.7        pkgbuild_1.1.0     pillar_1.4.6      
## [17] glue_1.4.1         withr_2.2.0        matrixStats_0.56.0 lifecycle_0.2.0   
## [21] stringr_1.4.0      munsell_0.5.0      gtable_0.3.0       codetools_0.2-16  
## [25] evaluate_0.14      inline_0.3.15      knitr_1.29         callr_3.4.3       
## [29] ps_1.3.3           curl_4.3           parallel_4.0.2     fansi_0.4.1       
## [33] Rcpp_1.0.5.1       scales_1.1.1       RcppParallel_5.0.2 jsonlite_1.7.0    
## [37] gridExtra_2.3      digest_0.6.25      stringi_1.4.6      processx_3.4.3    
## [41] grid_4.0.2         cli_2.0.2          tools_4.0.2        magrittr_1.5      
## [45] tibble_3.0.3       crayon_1.3.4       pkgconfig_2.0.3    Matrix_1.2-18     
## [49] ellipsis_0.3.1     prettyunits_1.1.1  assertthat_0.2.1   rmarkdown_2.3     
## [53] R6_2.4.1           compiler_4.0.2

Load curated dataset

load('Inputs/curated_modelling_dataset.RData')
# load imputed datasets
load('Inputs/imputed_Vietam_minimal.RData')

Reference Model P(Data | Severe malaria)

Figure 1

Model_mu = colMeans(dat_Training)
Model_sigma = cov(dat_Training)
print(dim(dat_Training))
## [1] 1704    2
mycols = adjustcolor(RColorBrewer::brewer.pal(name = 'Set2',n = 4),.3)[c(1,4)]
# AQ Vietnam data and Kenyan data overlaid
par(las=1,mfrow=c(1,2),bty='n', family='serif', 
    cex.lab=1.3, cex.axis=1.3, mar = c(5,7,4,2))
axis_titles = c('Platelet count (x1000 per uL)',
                'White count (x1000 per uL)')
axis_points = list(1:3, 0:2)
my_contours = c(0.5, .9)
panels = c('A','B')
dat_kenya$platelets_log10 = log10(dat_kenya$platelet)
dat_kenya$wbc_log10 = log10(dat_kenya$wbc)

plotting_dat = rbind(dat_Training, dat_kenya[,c("platelets_log10",
                                                "wbc_log10") ])
study = c(rep(1, nrow(dat_Training)), rep(2, nrow(dat_kenya)))
ind_rand = sample(1:nrow(plotting_dat), nrow(plotting_dat), replace = F)
plot(plotting_dat[ind_rand,1],plotting_dat[ind_rand,2], 
     main='', xaxt='n',yaxt='n',pch=c(17,15)[study[ind_rand]], 
     panel.first=grid(), xlim = c(.9, 3.1),
     col = mycols[study[ind_rand]], xlab = '',ylab = '')
mtext(text = axis_titles[1], side = 1, line = 3, 
      cex.lab=1.3, cex.axis=1.3,cex = 1.3)
mtext(text = axis_titles[2], side = 2, line = 3,
      las=3, cex.lab=1.3, cex.axis=1.3,cex = 1.3)

points(dat_kenya[dat_kenya$HbAS==1, "platelets_log10"], 
       dat_kenya[dat_kenya$HbAS==1, "wbc_log10"],
       col='black',pch=18)
axis(side = 1, at = axis_points[[1]], 
     labels = 10^axis_points[[1]])
axis(side = 2, at = axis_points[[2]], 
     labels = 10^axis_points[[2]])
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))), 
     tick = T, labels = F)
axis(side = 2, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))), 
     tick = T, labels = F)

for(qq in my_contours){
  lines(ellipse(x = Model_sigma, centre = Model_mu, 
                level = qq), lty=2,lwd=3,col='darkgreen')
  
  lines(ellipse(x = cov(dat_kenya[,c("platelets_log10","wbc_log10")],use = 'comp'), 
                centre = colMeans(dat_kenya[,c("platelets_log10","wbc_log10")],na.rm = T), 
                level = qq), lty=2,lwd=3,col='purple')
}
mtext(text = panels[1], side = 3, line = 1, adj = 0, cex=1.5)
legend('bottomright',inset = 0.02,cex=.9, 
       legend = c('Training data','Kenyan data','HbAS (Kenyan)'),
       col = c(mycols,'black'), pch=c(17,15,18))


############################################################
## ************** Platelets versus PfHRP2 **************
dat_feast_hrp2 = read.csv('Inputs/FEAST_platelets_hrp2.csv')
## Values less than or equal to 1 ng/mL are set to 1ng/mL with random jitter for plotting purposes
dat_feast_hrp2$hrp2[dat_feast_hrp2$hrp2<=1]= 10^rnorm(sum(dat_feast_hrp2$hrp2<=1), 0,0.05)
ind = !is.na(dat_core$hrp2) & dat_core$hrp2>0
mycoefs= coef(MASS::rlm(log10(dat_core$hrp2[ind])~ log10(dat_core$platelets[ind])))

mycols = c(brewer.pal(n = 9, 'Set1')[c(3,1)], 'black')
plot(log10(dat_core$platelets), 
     log10(dat_core$hrp2), xlim = c(.9, 3),panel.first = grid(),
     xlab='Platelet count (x1000) per uL', ylab='', xaxt='n', yaxt='n',
     ylim =c(-.1, 5), pch = 16, col=mycols[1], cex=1.2)
mtext(text = 'PfHRP2 (ng/mL)', side = 2, line = 4, las = 3, cex=1.3)

axis(2, at = 0:5, labels = 10^(0:5))
axis(side = 1, at = axis_points[[1]], 
     labels = 10^axis_points[[1]])
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))), 
     tick = T, labels = F)

# make index for blood slide +/-
dat_feast_hrp2$type=2
ind_0parasite = which(!is.na(dat_feast_hrp2$hrp2) & dat_feast_hrp2$parasitaemia==0)
dat_feast_hrp2$type[ind_0parasite] = 3
points(log10(dat_feast_hrp2$platelets), log10(dat_feast_hrp2$hrp2),
       pch=c(15,17)[(dat_feast_hrp2$type-1)], 
       col= adjustcolor(mycols[dat_feast_hrp2$type], .6), cex=1.2)

ind_spline = !is.na(dat_feast_hrp2$platelets) & dat_feast_hrp2$platelets>10
f=smooth.spline(y = log10(dat_feast_hrp2$hrp2[ind_spline]),
                x = log10(dat_feast_hrp2$platelets[ind_spline]))
lines(f, lwd=4, col='grey',lty=1)
lines(0:10, mycoefs[1]+mycoefs[2]*(0:10), lwd=4, col=mycols[1],lty=1)
legend('topright', legend = c('Bangladesh (adults)', 
                              'FEAST (+blood slide)', 
                              'FEAST (-blood slide)'), 
       col = c(mycols), pch=c(16,15,17), cex=.9)
mtext(text = 'B', side = 3, line = 1, adj = 0, cex=1.5)

writeLines(sprintf('Training data contains %s individuals', nrow(dat_Training)))
## Training data contains 1704 individuals

Figure for presentation

par(las=1,mfrow=c(1,1),bty='n', family='serif', 
    cex.lab=1.3, cex.axis=1.3, mar = c(5,7,4,2))

plot(log10(dat_core$platelets), 
     log10(dat_core$hrp2), xlim = c(.9, 3),
     xlab='Platelet count (x1000) per uL', ylab='', xaxt='n', yaxt='n',
     ylim =c(-.1, 5), pch = 16, col=mycols[1], cex=1.2)
mtext(text = 'PfHRP2 (ng/mL)', side = 2, line = 4, las = 3, cex=1.3)

axis(2, at = 0:5, labels = 10^(0:5))
axis(side = 1, at = axis_points[[1]], 
     labels = 10^axis_points[[1]])
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))), 
     tick = T, labels = F)


points(log10(dat_feast_hrp2$platelets), log10(dat_feast_hrp2$hrp2),
       pch=c(15,17)[(dat_feast_hrp2$type-1)], 
       col= adjustcolor(mycols[dat_feast_hrp2$type], .6), cex=1.2)

lines(f, lwd=4, col='grey',lty=1)
lines(0:10, mycoefs[1]+mycoefs[2]*(0:10), lwd=4, col=mycols[1],lty=1)
abline(v = log10(200), lty=2)
legend('topright', legend = c('Bangladesh (adults)', 
                              'FEAST (+blood slide)', 
                              'FEAST (-blood slide)'), 
       col = c(mycols), pch=c(16,15,17), cex=.9)
mtext(text = 'B', side = 3, line = 1, adj = 0, cex=1.5)

mycols = adjustcolor(RColorBrewer::brewer.pal(name = 'Set2',n = 4),.6)[c(1,4)]
# AQ Vietnam data and Kenyan data overlaid without classification colors
panels = c('','')

## Just training data
plot(dat_Training[,1],dat_Training[,2], 
     main='', xaxt='n',yaxt='n',pch=17, xlim = c(.9, 3.1),
     panel.first=grid(),
     ylim = range(plotting_dat[,2], na.rm = T),
     col = mycols[1], xlab = '',ylab = '')
mtext(text = axis_titles[1], side = 1, line = 3, 
      cex.lab=1.3, cex.axis=1.3,cex = 1.3)
mtext(text = axis_titles[2], side = 2, line = 3,
      las=3, cex.lab=1.3, cex.axis=1.3,cex = 1.3)

axis(side = 1, at = axis_points[[1]], 
     labels = 10^axis_points[[1]])
axis(side = 2, at = axis_points[[2]], 
     labels = 10^axis_points[[2]])
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))), 
     tick = T, labels = F)
axis(side = 2, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))), 
     tick = T, labels = F)

for(qq in my_contours){
  lines(ellipse(x = Model_sigma, centre = Model_mu, 
                level = qq), lty=2,lwd=3,col='darkgreen')
}

# Training and testing data - imputed and age corrected
mycols = adjustcolor(RColorBrewer::brewer.pal(name = 'Set2',n = 4),.4)[c(1,4)]

plot(plotting_dat[ind_rand,1],plotting_dat[ind_rand,2], 
     main='', xaxt='n',yaxt='n',pch=c(17,15)[study[ind_rand]], 
     panel.first=grid(),xlim = c(.9, 3.1),
     col = mycols[study[ind_rand]], xlab = '',ylab = '')
mtext(text = axis_titles[1], side = 1, line = 3, 
      cex.lab=1.3, cex.axis=1.3,cex = 1.3)
mtext(text = axis_titles[2], side = 2, line = 3,
      las=3, cex.lab=1.3, cex.axis=1.3,cex = 1.3)

points(dat_kenya$log10_platelet_imputed[dat_kenya$HbAS==1], 
       dat_kenya$log10_wbc_imputed_corrected[dat_kenya$HbAS==1],
       col='black',pch=18)
axis(side = 1, at = axis_points[[1]], 
     labels = 10^axis_points[[1]])
axis(side = 2, at = axis_points[[2]], 
     labels = 10^axis_points[[2]])
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))), 
     tick = T, labels = F)
axis(side = 2, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))), 
     tick = T, labels = F)

for(qq in my_contours){
  lines(ellipse(x = Model_sigma, centre = Model_mu, 
                level = qq), lty=2,lwd=3,col='darkgreen')
  
  lines(ellipse(x = cov(dat_kenya[,c('log10_platelet_imputed',
                                     'log10_wbc_imputed_corrected')]), 
                centre = colMeans(dat_kenya[,c('log10_platelet_imputed',
                                               'log10_wbc_imputed_corrected')]), 
                level = qq), lty=2,lwd=3,col='purple')
}
legend('bottomright',inset = 0.02,cex=1.1, 
       legend = c('Training data','Kenyan data','HbAS (Kenyan)'),
       col = c(mycols,'black'), pch=c(17,15,18))

par(mfrow=c(1,1),las=1)
dat_feast_hrp2$SM = as.numeric(dat_feast_hrp2$hrp2>=1000)
mod = mgcv::gam(SM ~ s(log(platelets)), family='binomial', data=dat_feast_hrp2)
summary(mod)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## SM ~ s(log(platelets))
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.1273     0.1473  -7.652 1.99e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df Chi.sq p-value    
## s(log(platelets)) 3.603  4.503  141.6  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.369   Deviance explained =   32%
## UBRE = -0.11596  Scale est. = 1         n = 567
xs = seq(1,3, length.out = 1000)
ys=predict(mod, newdata = data.frame(platelets=10^xs), se.fit = T)
uppers = inv.logit(ys$fit + 1.96*ys$se.fit)
lowers = inv.logit(ys$fit - 1.96*ys$se.fit)
plot(xs, inv.logit(ys$fit), ylim = c(0,1),
     main = 'Severely ill African children (FEAST)',
     type='l', ylab='Probability PfHRP2>1000 ng/mL', 
     panel.first=grid(),
     xlab='Platelet count', xaxt='n')
axis(side = 1, at = axis_points[[1]], 
     labels = 10^axis_points[[1]])
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))), 
     tick = T, labels = F)
polygon(c(xs, rev(xs)), c(uppers, rev(lowers)), border = NA, col = adjustcolor('grey', .2))

dat_feast_hrp2$parasites = as.numeric(dat_feast_hrp2$parasitaemia>=10000)
mod = mgcv::gam(parasites ~ s(log(platelets)), 
                family='binomial', data=dat_feast_hrp2)
summary(mod)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## parasites ~ s(log(platelets))
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.6246     0.1281  -4.877 1.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df Chi.sq  p-value    
## s(log(platelets)) 3.952  4.916  79.72 1.06e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.237   Deviance explained = 20.3%
## UBRE = 0.093997  Scale est. = 1         n = 433
xs = seq(1,3, length.out = 1000)
ys=predict(mod, newdata = data.frame(platelets=10^xs), se.fit = T)
uppers = inv.logit(ys$fit + 1.96*ys$se.fit)
lowers = inv.logit(ys$fit - 1.96*ys$se.fit)
plot(xs, inv.logit(ys$fit), ylim = c(0,1),
     main = 'Severely ill african children (FEAST)',
     type='l', ylab='Probability >10,000 parasites/uL', panel.first=grid(),
     xlab='Platelet count', xaxt='n')
axis(side = 1, at = axis_points[[1]], 
     labels = 10^axis_points[[1]])
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))), 
     tick = T, labels = F)
polygon(c(xs, rev(xs)), c(uppers, rev(lowers)), border = NA, col = adjustcolor('grey', .2))

Estimating P(Severe malaria) in the Kenyan cohort

Full model

biomarker_vars = c('log10_platelet_imputed',
                   'log10_wbc_imputed_corrected')
contamination_model = stan_model(file = 'contamination_model.stan')
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DBOOST_NO_AUTO_PTR  -include '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:88:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:1: error: unknown type name 'namespace'
## namespace Eigen {
## ^
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:16: error: expected ';' after top level declarator
## namespace Eigen {
##                ^
##                ;
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
## #include <complex>
##          ^~~~~~~~~
## 3 errors generated.
## make: *** [foo.o] Error 1
dat_kenya$Ref_Model_Likelihood = dmvt(x = dat_kenya[, biomarker_vars], 
                                      delta = Model_mu, 
                                      sigma = Model_sigma, 
                                      df = 7)
hist(dat_kenya$Ref_Model_Likelihood, 
     xlab='Likelihood under the reference model',main='')

threshold_100_percent = quantile(dat_kenya$Ref_Model_Likelihood,probs=0.8)

ind = !is.na(dat_kenya$HbAS)

# make a sub data set with available HbAS - which we sort by Likelihood value
kemri_sickle_data = data.frame(sickle_trait=dat_kenya$HbAS[ind],
                               lik = dat_kenya$Ref_Model_Likelihood[ind])
kemri_sickle_data = kemri_sickle_data[order(kemri_sickle_data$lik), ]
N = nrow(kemri_sickle_data)

# in the top quintile
N2 = sum(kemri_sickle_data$lik>threshold_100_percent)
print(N2)
## [1] 443
N2_sickle = sum(kemri_sickle_data$sickle_trait[(N-N2+1):N])
print(N2_sickle)
## [1] 4
Nunknown = N-N2
print(Nunknown)
## [1] 1770
Nunknown_sickle = sum(kemri_sickle_data$sickle_trait[1:(N-N2)])
print(Nunknown_sickle)
## [1] 53
# Run Bayesian mode
out_mod=sampling(object = contamination_model, 
                 data = list(Nunknown = Nunknown, 
                             Nunknown_sickle = Nunknown_sickle, 
                             N1 = 6748,
                             N1_sickle = 364,
                             N2 = N2,
                             N2_sickle = N2_sickle,
                             b1 = 5, b2=95, a1=1, a2=99), iter=10^5)
## Warning: There were 1 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
thetas_contam = extract(out_mod)
print(quantile(1-thetas_contam$q, probs = c(0.025, .5, .975)))
##      2.5%       50%     97.5% 
## 0.3107815 0.5281657 0.7656431
print(100*mean(thetas_contam$p1))
## [1] 5.370524
print(100*mean(thetas_contam$p2))
## [1] 0.9599943
P_SM_main = 1-mean(thetas_contam$q) * (N-N2) / N

est_SM = 1-(quantile(thetas_contam$q * (N-N2), 
                     probs = rev(c(0.025, .1, .5, .9, .975)))/N)
est_SM = round(100*est_SM)
writeLines(sprintf('The proportion of individuals in the dataset with severe malaria is %s%% (95%% credible interval %s-%s)',
                   est_SM[3], est_SM[1], est_SM[5]))
## The proportion of individuals in the dataset with severe malaria is 62% (95% credible interval 45-81)

Estimating individual probabilities P(Severe malaria | Data)

Priors

source('stan_mixture_model_student2.R')
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DBOOST_NO_AUTO_PTR  -include '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:88:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:1: error: unknown type name 'namespace'
## namespace Eigen {
## ^
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:16: error: expected ';' after top level declarator
## namespace Eigen {
##                ^
##                ;
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
## #include <complex>
##          ^~~~~~~~~
## 3 errors generated.
## make: *** [foo.o] Error 1
mu_priorSM = c(1.8, .95)
mu_prior_notSM = c(2.5, 1.5)
sd_prior_notSM = c(.25, .25)
sd_priorSM = c(0.1, 0.1)
prior_beta1 = .65*est_SM[3]
prior_beta2 = .65*(100-est_SM[3])

hist(rbeta(10^5, prior_beta1, prior_beta2))

Penalised max-likelihood

Testing_data = dat_kenya[, biomarker_vars] # age-corrected wbc

k_imputed_dataset = 1
stan_data = list(D=2,
                 K_notSM=6,
                 N_Training_Africa = nrow(dat_feast_training),
                 N_Training_Asia = nrow(imputed_Vietam_minimal[[k_imputed_dataset]])+
                   nrow(dat_core),
                 N_Testing = nrow(Testing_data),
                 y_Training_Africa = dat_feast_training[,c("platelets_log10",
                                                           "wbc_log10_corrected")],
                 y_Training_Asia = rbind(imputed_Vietam_minimal[[1]][, c("platelets_log10",
                                                                         "wbc_log10")],
                                         dplyr::rename(dat_core[, c("platelets_log10",
                                                      "log10_wbc_corrected")],
                                                wbc_log10=log10_wbc_corrected)),
                 y_Testing = Testing_data,
                 mu_priorSM = mu_priorSM,
                 mu_prior_notSM = mu_prior_notSM,
                 sd_prior_notSM = sd_prior_notSM,
                 sd_priorSM = sd_priorSM,
                 prior_beta1 = prior_beta1,
                 prior_beta2 = prior_beta2)

out_mod_student_ML=
  optimizing(object = mix_model_student2, data = stan_data, seed = 6987)

round(out_mod_student_ML$par[grep('theta', names(out_mod_student_ML$par))],2)
##       theta_SM  theta_SMstudy theta_notSM[1] theta_notSM[2] theta_notSM[3] 
##           0.54           0.05           0.01           0.01           0.02 
## theta_notSM[4] theta_notSM[5] theta_notSM[6] 
##           0.61           0.01           0.33
round(10^out_mod_student_ML$par[grep('mu_', names(out_mod_student_ML$par))],1)
##         mu_SM[1]         mu_SM[2] mu_SM_study[1,1] mu_SM_study[2,1] 
##             68.6              8.4             57.7             83.5 
## mu_SM_study[1,2] mu_SM_study[2,2]    mu_notSM[1,1]    mu_notSM[2,1] 
##              8.2              8.5            248.6            163.1 
##    mu_notSM[3,1]    mu_notSM[4,1]    mu_notSM[5,1]    mu_notSM[6,1] 
##            459.3            126.5            373.0            264.2 
##    mu_notSM[1,2]    mu_notSM[2,2]    mu_notSM[3,2]    mu_notSM[4,2] 
##             22.0             39.0             30.0             18.3 
##    mu_notSM[5,2]    mu_notSM[6,2] 
##             19.6             10.8
round(10^out_mod_student_ML$par[grep('mu', names(out_mod_student_ML$par))])
##         mu_SM[1]         mu_SM[2] mu_SM_study[1,1] mu_SM_study[2,1] 
##               69                8               58               84 
## mu_SM_study[1,2] mu_SM_study[2,2]    mu_notSM[1,1]    mu_notSM[2,1] 
##                8                9              249              163 
##    mu_notSM[3,1]    mu_notSM[4,1]    mu_notSM[5,1]    mu_notSM[6,1] 
##              459              126              373              264 
##    mu_notSM[1,2]    mu_notSM[2,2]    mu_notSM[3,2]    mu_notSM[4,2] 
##               22               39               30               18 
##    mu_notSM[5,2]    mu_notSM[6,2]          sd_muSM 
##               20               11                1
ps1=out_mod_student_ML$par[grep('prob_SM', names(out_mod_student_ML$par))]
hist(ps1)

Full Bayes

# Takes about 9 hours to run the sampler
if(RUN_STAN){
  out_mod_student=sampling(object = mix_model_student2, 
                           data = stan_data, 
                           seed=29864826, chains=7)
  
  save(out_mod_student, file = 'Outputs/mixture_model_out.RData')
} else {
  load(file = 'Outputs/mixture_model_out.RData')
}
my_thetas = extract(out_mod_student)
traceplot(out_mod_student, pars=c('mu_SM','theta_SM','theta_SMstudy'))

plot(out_mod_student, pars=c('theta_SM', 'theta_SMstudy','theta_notSM'))
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

plot(out_mod_student, pars=c('mu_SM','mu_SM_study','mu_notSM'))
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

hist(my_thetas$theta_SM, freq = F, xlab='P(Severe malaria)', 
     yaxt='n', ylab = '',main='')
lines(seq(0,1,length.out = 100),
      dbeta(seq(0,1,length.out = 100),prior_beta1,prior_beta2),
      lwd=3,col='red')

median(my_thetas$theta_SM)
## [1] 0.5786242
ps_post = apply(my_thetas$prob_SM, 2, mean)
dat_kenya$P_SMs = ps_post

Predictive value of P(Severe malaria | Data)

Fit GAMS to likelihood values (quantiles of) with platelets/wbc/parasitaemia

par(las=1, bty='n', family='serif', mfrow=c(2,2),cex.lab=1.5, cex.axis=1.5)

hist(dat_kenya$P_SMs, breaks = 20, ylab = 'Number of patients',
     xlab = 'P(Severe malaria | Data)', main='')
mtext(text = 'A', side = 3, adj = 0, cex = 1.5)

f=colorRampPalette(colors = RColorBrewer::brewer.pal(name = 'RdYlBu',n = 11))
n_levels = 11
mycuts = cut(x = dat_kenya$P_SMs, breaks = seq(0,1,by=1/n_levels))
cols_ind = as.numeric(mycuts)
mycols = f(n_levels)
plot(log10(dat_kenya$platelet),log10(dat_kenya$wbc),
     xlab='Platelet count (x1000 per uL)', xaxt='n', yaxt='n',
     ylab='White count (x1000 per uL)', pch=16, panel.first=grid(),
     col = adjustcolor(mycols[cols_ind], .7))
ind_sickle=dat_kenya$HbAS==1
points(log10(dat_kenya$platelet)[ind_sickle],
       log10(dat_kenya$wbc)[ind_sickle], pch=18, cex=1.5)
axis_points = list(1:3, 0:2)
axis(side = 1, at = axis_points[[1]], 
     labels = 10^axis_points[[1]])
axis(side = 2, at = axis_points[[2]], 
     labels = 10^axis_points[[2]])
axis(side = 1, at = log10(c(seq(10,100,by=10), seq(100,1000,by=100))), 
     tick = T, labels = F)
axis(side = 2, at = log10(c(seq(1,10,by=1), seq(10,100,by=10))), 
     tick = T, labels = F)
lgd_ = rep(NA, n_levels)
lgd_[c(1,6,11)] = c(0,0.5,1)
legend('bottomright',
       legend = lgd_,
       fill = mycols,bty='n',x.intersp =.5,
       border = NA,inset = 0.01,
       y.intersp = 0.5,title = '',
       cex = 1, text.font = 1)
mtext(text = 'B', side = 3, adj = 0, cex = 1.5)



myx = dat_kenya$P_SMs
xs = seq(0,1,length.out = 100)[-1]

##******** SICKLE ****************
mod = gam(sickle_trait ~ s(x,k=4), family='binomial', 
          data=data.frame(x=myx, 
                          sickle_trait=dat_kenya$HbAS))
summary(mod)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## sickle_trait ~ s(x, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.8356     0.1612  -23.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##        edf Ref.df Chi.sq  p-value    
## s(x) 1.002  1.004  19.96 8.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00925   Deviance explained = 4.22%
## UBRE = -0.76896  Scale est. = 1         n = 2213
preds = predict(mod, se.fit = T, newdata = data.frame(x=xs))
ys = 100*c(inv.logit(preds$fit+1.96*preds$se.fit),
           rev(inv.logit(preds$fit-1.96*preds$se.fit)))
plot(xs, 100*inv.logit(preds$fit),type='l',lwd=3,
     xlim = c(0,1),
     xlab='P(Severe malaria | Data)',
     ylim = range(c(0,ys)), ylab='HbAS (%)')
polygon(c(xs, rev(xs)), ys,panel.first=grid(),
        border = NA, col = adjustcolor('grey',.3))
mtext(text = 'C', side = 3, adj = 0, cex = 1.5)
abline(h = 100*mean(dat_kenya$HbAS),lty=2, lwd=2)


writeLines('##******** DEATH ****************')
## ##******** DEATH ****************
mod = gam(died ~ s(x,k=4), family='binomial', 
          data=data.frame(x=myx, 
                          died=dat_kenya$died))
summary(mod)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## died ~ s(x, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.10500    0.07083  -29.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##        edf Ref.df Chi.sq  p-value    
## s(x) 2.517  2.827  44.93 1.05e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0216   Deviance explained = 2.95%
## UBRE = -0.30028  Scale est. = 1         n = 2207
not_na_died = !is.na(dat_kenya$died)
preds = predict(mod, se.fit = T, newdata = data.frame(x=xs))
ys = 100*c(inv.logit(preds$fit+
                       1.96*preds$se.fit),
           rev(inv.logit(preds$fit-
                           1.96*preds$se.fit)))
plot(xs,100*inv.logit(preds$fit),type='l',lwd=3,
     xlim = c(0,1), ylim = range(c(0,ys)),
     xlab='P(Severe malaria | Data)',
     ylab='Mortality (%)')
polygon(c(xs, rev(xs)),ys,panel.first=grid(),
        border = NA, col = adjustcolor('grey',.3))
mtext(text = 'D', side = 3, adj = 0, cex = 1.5)
abline(h = 100*mean(dat_kenya$died, na.rm = T),
       lty=2, lwd=2)

Summarise the top and bottom 20th percentiles

upper_p = round(quantile(dat_kenya$P_SMs, probs=.8),3)
print(upper_p)
##   80% 
## 0.924
lower_p = round(quantile(dat_kenya$P_SMs, probs=.2),3)
print(lower_p)
##   20% 
## 0.183
top_ind = dat_kenya$P_SMs>upper_p
bottom_ind = dat_kenya$P_SMs<lower_p

sum(top_ind)
## [1] 446
sum(bottom_ind)
## [1] 444
sum(dat_kenya$HbAS[top_ind],na.rm = T)
## [1] 3
sum(dat_kenya$HbAS[bottom_ind],na.rm = T)
## [1] 21
writeLines(sprintf('For patients with P(SM|Data)>%s, %s%% have sickle trait versus %s%% for patients with P(SM|Data)< %s',
                   upper_p,
                   round(100*mean(dat_kenya$HbAS[top_ind],na.rm = T),1),
                   round(100*mean(dat_kenya$HbAS[bottom_ind],na.rm = T),1),
                   lower_p))
## For patients with P(SM|Data)>0.924, 0.7% have sickle trait versus 4.8% for patients with P(SM|Data)< 0.183
writeLines(sprintf('For patients with P(SM|Data)>%s, %s%% died versus %s%% for patients with P(SM|Data)< %s',
                   upper_p,
                   round(100*mean(dat_kenya$died[top_ind],na.rm = T),1),
                   round(100*mean(dat_kenya$died[bottom_ind],na.rm = T),1),
                   lower_p))
## For patients with P(SM|Data)>0.924, 6.1% died versus 18.8% for patients with P(SM|Data)< 0.183

Save output

ws_dat = data.frame(#IID = dat_kenya$IID,
                    ws = ps_post)
save(ws_dat, file = 'Outputs/patient_SM_weights.RData')
save(dat_kenya, file = 'Outputs/Kenya_dat_PSM.RData')